{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "source": [
    "import pandas as pd\r\n",
    "import numpy as np\r\n",
    "from sklearn.metrics import accuracy_score\r\n",
    "from sklearn.ensemble import RandomForestClassifier\r\n",
    "from sklearn.decomposition import PCA\r\n",
    "from sklearn.model_selection import train_test_split"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "source": [
    "data = pd.read_csv('Data/Phishing_Legitimate_full.csv')\r\n",
    "data.head(10)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NumDots</th>\n",
       "      <th>SubdomainLevel</th>\n",
       "      <th>PathLevel</th>\n",
       "      <th>UrlLength</th>\n",
       "      <th>NumDash</th>\n",
       "      <th>NumDashInHostname</th>\n",
       "      <th>AtSymbol</th>\n",
       "      <th>TildeSymbol</th>\n",
       "      <th>NumUnderscore</th>\n",
       "      <th>NumPercent</th>\n",
       "      <th>...</th>\n",
       "      <th>IframeOrFrame</th>\n",
       "      <th>MissingTitle</th>\n",
       "      <th>ImagesOnlyInForm</th>\n",
       "      <th>SubdomainLevelRT</th>\n",
       "      <th>UrlLengthRT</th>\n",
       "      <th>PctExtResourceUrlsRT</th>\n",
       "      <th>AbnormalExtFormActionR</th>\n",
       "      <th>ExtMetaScriptLinkRT</th>\n",
       "      <th>PctExtNullSelfRedirectHyperlinksRT</th>\n",
       "      <th>CLASS_LABEL</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>72</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>144</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>58</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>79</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>46</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>42</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>76</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>46</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 49 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   NumDots  SubdomainLevel  PathLevel  UrlLength  NumDash  NumDashInHostname  \\\n",
       "0        3               1          5         72        0                  0   \n",
       "1        3               1          3        144        0                  0   \n",
       "2        3               1          2         58        0                  0   \n",
       "3        3               1          6         79        1                  0   \n",
       "4        3               0          4         46        0                  0   \n",
       "5        3               1          1         42        1                  0   \n",
       "6        2               0          5         60        0                  0   \n",
       "7        1               0          3         30        0                  0   \n",
       "8        8               7          2         76        1                  1   \n",
       "9        2               0          2         46        0                  0   \n",
       "\n",
       "   AtSymbol  TildeSymbol  NumUnderscore  NumPercent  ...  IframeOrFrame  \\\n",
       "0         0            0              0           0  ...              0   \n",
       "1         0            0              2           0  ...              0   \n",
       "2         0            0              0           0  ...              0   \n",
       "3         0            0              0           0  ...              0   \n",
       "4         0            0              0           0  ...              1   \n",
       "5         0            0              0           0  ...              1   \n",
       "6         0            0              0           0  ...              0   \n",
       "7         0            0              0           0  ...              0   \n",
       "8         0            0              0           0  ...              0   \n",
       "9         0            0              0           0  ...              0   \n",
       "\n",
       "   MissingTitle  ImagesOnlyInForm  SubdomainLevelRT  UrlLengthRT  \\\n",
       "0             0                 1                 1            0   \n",
       "1             0                 0                 1           -1   \n",
       "2             0                 0                 1            0   \n",
       "3             0                 0                 1           -1   \n",
       "4             0                 0                 1            1   \n",
       "5             1                 0                 1            1   \n",
       "6             0                 0                 1            0   \n",
       "7             0                 0                 1            1   \n",
       "8             0                 0                -1           -1   \n",
       "9             0                 0                 1            1   \n",
       "\n",
       "   PctExtResourceUrlsRT  AbnormalExtFormActionR  ExtMetaScriptLinkRT  \\\n",
       "0                     1                       1                   -1   \n",
       "1                     1                       1                    1   \n",
       "2                    -1                       1                   -1   \n",
       "3                     1                       1                    1   \n",
       "4                    -1                       0                   -1   \n",
       "5                     1                       1                   -1   \n",
       "6                     1                       1                   -1   \n",
       "7                     1                       1                    1   \n",
       "8                     1                       1                    1   \n",
       "9                     1                       1                   -1   \n",
       "\n",
       "   PctExtNullSelfRedirectHyperlinksRT  CLASS_LABEL  \n",
       "0                                   1            1  \n",
       "1                                   1            1  \n",
       "2                                   0            1  \n",
       "3                                  -1            1  \n",
       "4                                  -1            1  \n",
       "5                                   1            1  \n",
       "6                                  -1            1  \n",
       "7                                   1            1  \n",
       "8                                  -1            1  \n",
       "9                                  -1            1  \n",
       "\n",
       "[10 rows x 49 columns]"
      ]
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "source": [
    "data.columns"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Index(['NumDots', 'SubdomainLevel', 'PathLevel', 'UrlLength', 'NumDash',\n",
       "       'NumDashInHostname', 'AtSymbol', 'TildeSymbol', 'NumUnderscore',\n",
       "       'NumPercent', 'NumQueryComponents', 'NumAmpersand', 'NumHash',\n",
       "       'NumNumericChars', 'NoHttps', 'RandomString', 'IpAddress',\n",
       "       'DomainInSubdomains', 'DomainInPaths', 'HttpsInHostname',\n",
       "       'HostnameLength', 'PathLength', 'QueryLength', 'DoubleSlashInPath',\n",
       "       'NumSensitiveWords', 'EmbeddedBrandName', 'PctExtHyperlinks',\n",
       "       'PctExtResourceUrls', 'ExtFavicon', 'InsecureForms',\n",
       "       'RelativeFormAction', 'ExtFormAction', 'AbnormalFormAction',\n",
       "       'PctNullSelfRedirectHyperlinks', 'FrequentDomainNameMismatch',\n",
       "       'FakeLinkInStatusBar', 'RightClickDisabled', 'PopUpWindow',\n",
       "       'SubmitInfoToEmail', 'IframeOrFrame', 'MissingTitle',\n",
       "       'ImagesOnlyInForm', 'SubdomainLevelRT', 'UrlLengthRT',\n",
       "       'PctExtResourceUrlsRT', 'AbnormalExtFormActionR', 'ExtMetaScriptLinkRT',\n",
       "       'PctExtNullSelfRedirectHyperlinksRT', 'CLASS_LABEL'],\n",
       "      dtype='object')"
      ]
     },
     "metadata": {},
     "execution_count": 3
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "source": [
    "y = data.pop('CLASS_LABEL')\r\n",
    "X = data"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)\r\n",
    "X_train"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NumDots</th>\n",
       "      <th>SubdomainLevel</th>\n",
       "      <th>PathLevel</th>\n",
       "      <th>UrlLength</th>\n",
       "      <th>NumDash</th>\n",
       "      <th>NumDashInHostname</th>\n",
       "      <th>AtSymbol</th>\n",
       "      <th>TildeSymbol</th>\n",
       "      <th>NumUnderscore</th>\n",
       "      <th>NumPercent</th>\n",
       "      <th>...</th>\n",
       "      <th>SubmitInfoToEmail</th>\n",
       "      <th>IframeOrFrame</th>\n",
       "      <th>MissingTitle</th>\n",
       "      <th>ImagesOnlyInForm</th>\n",
       "      <th>SubdomainLevelRT</th>\n",
       "      <th>UrlLengthRT</th>\n",
       "      <th>PctExtResourceUrlsRT</th>\n",
       "      <th>AbnormalExtFormActionR</th>\n",
       "      <th>ExtMetaScriptLinkRT</th>\n",
       "      <th>PctExtNullSelfRedirectHyperlinksRT</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>5525</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>109</td>\n",
       "      <td>11</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5034</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>15</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4709</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>56</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4218</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>62</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>917</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>664</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>45</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7540</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>62</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7221</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>77</td>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1318</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>49</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8915</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>46</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8000 rows × 48 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      NumDots  SubdomainLevel  PathLevel  UrlLength  NumDash  \\\n",
       "5525        1               0          7        109       11   \n",
       "5034        1               0          0         15        0   \n",
       "4709        2               0          4         56        1   \n",
       "4218        2               0          4         62        2   \n",
       "917         1               0          2         25        1   \n",
       "...       ...             ...        ...        ...      ...   \n",
       "664         2               0          3         45        0   \n",
       "7540        3               1          2         62        0   \n",
       "7221        3               0          3         77        4   \n",
       "1318        2               0          3         49        0   \n",
       "8915        1               0          4         46        1   \n",
       "\n",
       "      NumDashInHostname  AtSymbol  TildeSymbol  NumUnderscore  NumPercent  \\\n",
       "5525                  0         0            0              0           0   \n",
       "5034                  0         0            0              0           0   \n",
       "4709                  0         0            0              0           0   \n",
       "4218                  0         0            0              1           0   \n",
       "917                   1         0            0              0           0   \n",
       "...                 ...       ...          ...            ...         ...   \n",
       "664                   0         0            0              0           0   \n",
       "7540                  0         0            0              0           0   \n",
       "7221                  0         0            0              0           0   \n",
       "1318                  0         0            0              0           0   \n",
       "8915                  0         0            0              0           0   \n",
       "\n",
       "      ...  SubmitInfoToEmail  IframeOrFrame  MissingTitle  ImagesOnlyInForm  \\\n",
       "5525  ...                  1              0             0                 0   \n",
       "5034  ...                  0              0             0                 0   \n",
       "4709  ...                  0              0             0                 0   \n",
       "4218  ...                  0              0             0                 0   \n",
       "917   ...                  0              0             0                 0   \n",
       "...   ...                ...            ...           ...               ...   \n",
       "664   ...                  0              0             0                 0   \n",
       "7540  ...                  0              0             0                 0   \n",
       "7221  ...                  0              0             0                 0   \n",
       "1318  ...                  0              1             0                 0   \n",
       "8915  ...                  0              1             0                 0   \n",
       "\n",
       "      SubdomainLevelRT  UrlLengthRT  PctExtResourceUrlsRT  \\\n",
       "5525                 1           -1                     1   \n",
       "5034                 1            1                     1   \n",
       "4709                 1            0                    -1   \n",
       "4218                 1            0                     1   \n",
       "917                  1            1                     1   \n",
       "...                ...          ...                   ...   \n",
       "664                  1            1                     1   \n",
       "7540                 1            0                     1   \n",
       "7221                 1           -1                     1   \n",
       "1318                 1            1                    -1   \n",
       "8915                 1            1                     1   \n",
       "\n",
       "      AbnormalExtFormActionR  ExtMetaScriptLinkRT  \\\n",
       "5525                       1                    0   \n",
       "5034                       1                    0   \n",
       "4709                       1                   -1   \n",
       "4218                       1                    1   \n",
       "917                        1                   -1   \n",
       "...                      ...                  ...   \n",
       "664                        1                    1   \n",
       "7540                       1                    0   \n",
       "7221                      -1                    0   \n",
       "1318                       1                    1   \n",
       "8915                       0                    0   \n",
       "\n",
       "      PctExtNullSelfRedirectHyperlinksRT  \n",
       "5525                                   1  \n",
       "5034                                   1  \n",
       "4709                                  -1  \n",
       "4218                                   1  \n",
       "917                                    1  \n",
       "...                                  ...  \n",
       "664                                    1  \n",
       "7540                                   1  \n",
       "7221                                   1  \n",
       "1318                                  -1  \n",
       "8915                                   1  \n",
       "\n",
       "[8000 rows x 48 columns]"
      ]
     },
     "metadata": {},
     "execution_count": 5
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "source": [
    "clf = RandomForestClassifier(n_estimators=80, max_depth=85)\r\n",
    "clf.fit(X_train, y_train)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "RandomForestClassifier(max_depth=85, n_estimators=80)"
      ]
     },
     "metadata": {},
     "execution_count": 6
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "source": [
    "label_predict = clf.predict(X_test)\r\n",
    "label_predict"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([1, 1, 0, ..., 0, 0, 0], dtype=int64)"
      ]
     },
     "metadata": {},
     "execution_count": 7
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "source": [
    "accuracy_score(y_test, label_predict)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "0.979"
      ]
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "source": [
    "pca = PCA(n_components=5).fit(X_train)\r\n",
    "pca_data_train = pca.transform(X_train)\r\n",
    "pca_data_test = pca.transform(X_test)\r\n",
    "pca_data_train"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "array([[ 51.36312039, -43.10040097,  -8.49056064,  -3.09477374,\n",
       "          4.35845866],\n",
       "       [-63.26243107,  17.10688192, -12.09780016,   4.45628714,\n",
       "          1.92297625],\n",
       "       [-17.3350226 ,  -3.34468901,  -0.77832508,  -2.9569746 ,\n",
       "         -0.85164605],\n",
       "       ...,\n",
       "       [  8.89291272, -16.84360959,  -0.24470424,  -1.53134035,\n",
       "          1.2075523 ],\n",
       "       [-26.76251498,   2.88446423,   1.83814406,  -1.77963496,\n",
       "         -0.87197944],\n",
       "       [-25.74803349,  -0.35930416,  -7.20350265,   8.45745626,\n",
       "          0.49984096]])"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "source": [
    "clf = RandomForestClassifier(max_depth=10)\r\n",
    "clf.fit(pca_data_train, y_train)\r\n",
    "label_predict = clf.predict(pca_data_test)\r\n",
    "accuracy_score(y_test, label_predict)"
   ],
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "0.84"
      ]
     },
     "metadata": {},
     "execution_count": 10
    }
   ],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.11",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.11 64-bit ('ml-networking': conda)"
  },
  "interpreter": {
   "hash": "20bebc860e821dfc8c3402aa900049950a23cadb8eca4814f2eb319ccf2768b5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}